/* calculate main skewness measure and top and bottom skewness measures, with health majors */

libname edu '!userprofile\\Dropbox\Education\Replication\Data';

DATA crsp;  /*read all the crsp monthly data with nonmissing return*/
set edu.crsp_data;
IF not missing (RET);
if RET ne 'C';
if RET ne 'B';
year=year(date);
month=month(date);
if 1<=EXCHCD<=3 and SHRCD in (10 11);
naics_3=compress(substr(naics,1,3));
sic_2=floor(siccd/100);
mktcap = abs(prc)*shrout;
run;

/*annualize monthly returns*/

proc sql noprint;
create table annualReturns as
select *,
  count(ret) as nbMonths,
  exp(sum(log(1+ret))) as cumulativeReturn
from crsp
group by permno, year;
select * from annualReturns;
quit;

proc sort data = annualreturns; by permno year month; run;

data annualreturns;
set annualreturns;
by permno year month;
if last.year;
if  nbmonths>=6; /* requires 6 observations in a year */
ann_ret=cumulativeReturn-1;
run;

data annualreturns;
  set annualreturns;
  by permno year;
  lagmktcap = lag(mktcap);
  if first.permno then lagmktcap = .;
run;

/* get # employees, if there are duplicate entries, choose the larger and the most recent */
data emp (keep = lpermno datadate year emp); set edu.crspcompustat_data; year = year(datadate); run; 

proc sort data = emp; by lpermno year descending datadate descending emp; run;
proc sort data = emp nodupkey; by lpermno year; run;

proc sql;
  create table annualreturns as
  select distinct a.*, b.emp
  from annualreturns as a left join emp as b
  on a.permno = b.lpermno and a.year = b.year;
quit;

/* for each permco, only the permno that has the largest market cap will be counted (get rid of multiple share classes) */
proc sort data = annualreturns;
  by permco year descending mktcap;
run;

proc means data = annualreturns noprint;
  by permco year;
  var lagmktcap;
  output out = size sum = lagmktcap;
run;

data annualreturns (drop = lagmktcap);
  set annualreturns;
  by permco year;
  if first.year;
run;

proc sql;
  create table annualreturns as
  select a.*, b.lagmktcap
  from annualreturns as a, size as b
  where a.permco = b.permco and a.year = b.year;
quit;

/* map SIC2 and NAICS3 into major code (use SIC2 before 2005; NAICS3 afterwards) */
proc sql;
  create table temp as
  select a.*, b.major 
  from annualreturns as a left join edu.sic_2_major_bio as b
  on a.sic_2 = b.sic_2;
quit;

data temp;
  set temp;
  if year >= 2005 then delete;
run;

data temp2; set annualreturns; naics3 = input(naics_3, 12.); run;

proc sql;
  create table temp2 as
  select a.*, b.major 
  from temp2 as a left join edu.naics_3_major_bio as b
  on a.naics3 = b.naics_3;
quit;

data temp2 (drop = naics3);
  set temp2;
  if year >= 2005;
run;

data annualreturns_major; set temp; run;
proc append base = annualreturns_major data = temp2; run;

/* calculate main skewness measure */
data annualreturns2;
set annualreturns_major;
if not missing (major);
run;

proc sort data = annualreturns2;
by year major;
run; 

proc univariate data = annualreturns2 vardef = df noprint;
by year major;
var ann_ret;
weight emp;
output out=skewness_2e skew=skew1e;
run;

data edu.industry_skew_major_bio;
set skewness_2e;
run;
